Projeto 5 - Mapeando a Ocorrencia do Virus Zika

Para esta analise, vamos usar um conjunto de dados coletados do Portal da Saude do Governo Federal.

http://portalsaude.saude.gov.br/index.php/o-ministerio/principal/leia-mais-o-ministerio/1234-secretaria-svs/vigilancia-de-a-a-z/microcefalia-svs/22705&catid=1234&Itemid=250

http://combateaedes.saude.gov.br/pt/situacao-epidemiologica

Todo o projeto sera descrito de acordo com suas etapas. Os acentos foram ignorados para evitar problemas de interpretacao em diferentes sistemas operacionais.

Etapa 1 - Coletando os dados

# Carregando os pacotes
# devtools::install_github("wch/webshot")
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.2
# List the CSV bulletins found in the working directory and keep their names.
# `pattern` is a regular expression, so the dot is escaped and the match is
# anchored to the end of the file name; a bare ".csv" would also match names
# such as "mycsv_notes.txt" or "data.csv.bak".
temp_files <- list.files(pattern = "\\.csv$")
temp_files
## [1] "Epidemiological_Bulletin-2016-04-02.csv"
## [2] "Epidemiological_Bulletin-2016-04-23.csv"
## [3] "Epidemiological_Bulletin-2016-04-30.csv"
## [4] "Epidemiological_Bulletin-2016-05-07.csv"
## [5] "Epidemiological_Bulletin-2016-05-14.csv"
## [6] "Epidemiological_Bulletin-2016-05-21.csv"
## [7] "Epidemiological_Bulletin-2016-05-28.csv"
## [8] "Epidemiological_Bulletin-2016-06-11.csv"

Etapa 2 - Organizando os dados

# Read every listed file into its own data frame, keeping text columns as
# character vectors; the result is a list with one data frame per file
myfiles <- lapply(
  temp_files,
  function(csv_path) read.csv(csv_path, stringsAsFactors = FALSE)
)

# Compact overview of the list, limited to the first nesting level
str(myfiles, max.level = 1)
## List of 8
##  $ :'data.frame':    33 obs. of  9 variables:
##  $ :'data.frame':    33 obs. of  9 variables:
##  $ :'data.frame':    33 obs. of  9 variables:
##  $ :'data.frame':    33 obs. of  9 variables:
##  $ :'data.frame':    33 obs. of  9 variables:
##  $ :'data.frame':    33 obs. of  9 variables:
##  $ :'data.frame':    33 obs. of  9 variables:
##  $ :'data.frame':    33 obs. of  9 variables:
# Column names of the first data frame, returned as a one-element list
lapply(myfiles[1], names)
## [[1]]
## [1] "report_date"      "location"         "location_type"   
## [4] "data_field"       "data_field_code"  "time_period"     
## [7] "time_period_type" "value"            "unit"
# First two rows of each of the first two data frames
lapply(myfiles[1:2], head, n = 2)
## [[1]]
##   report_date        location location_type    data_field data_field_code
## 1  2016-04-02           Norte        region zika_reported          BR0011
## 2  2016-04-02 Brazil-Rondonia         state zika_reported          BR0011
##   time_period time_period_type value  unit
## 1          NA               NA  6295 cases
## 2          NA               NA   618 cases
## 
## [[2]]
##   report_date    location location_type    data_field data_field_code
## 1  2016-04-23       Norte        region zika_reported          BR0011
## 2  2016-04-23 Brazil-Acre         state zika_reported          BR0011
##   time_period time_period_type value  unit
## 1          NA               NA  8545 cases
## 2          NA               NA   716 cases
# Stack the per-file data frames into a single data frame and parse the
# report_date text into Date values
brazil <- mutate(
  do.call(rbind, myfiles),
  report_date = as.Date(report_date)
)
## Warning: package 'bindrcpp' was built under R version 3.4.1
# Column-wise preview of the combined dataset
brazil %>%
  glimpse()
## Observations: 264
## Variables: 9
## $ report_date      <date> 2016-04-02, 2016-04-02, 2016-04-02, 2016-04-...
## $ location         <chr> "Norte", "Brazil-Rondonia", "Brazil-Acre", "B...
## $ location_type    <chr> "region", "state", "state", "state", "state",...
## $ data_field       <chr> "zika_reported", "zika_reported", "zika_repor...
## $ data_field_code  <chr> "BR0011", "BR0011", "BR0011", "BR0011", "BR00...
## $ time_period      <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ time_period_type <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ value            <int> 6295, 618, 375, 1520, 44, 771, 74, 2893, 3028...
## $ unit             <chr> "cases", "cases", "cases", "cases", "cases", ...

Etapa 3 - Transformando o dataframe em um objeto dplyr

# Drop the two time-period columns (columns 6 and 7 of the raw files); in the
# preview printed above they appear to hold only NA.
# They are removed by name rather than by position: with `-(6:7)`, running
# this chunk a second time silently dropped `value` and `unit`, which had
# moved into positions 6 and 7. By name, a second run fails loudly instead.
brazil <- brazil %>%
  select(-time_period, -time_period_type)

# Show the first 20 rows
slice(brazil, seq_len(20))
## # A tibble: 20 x 7
##    report_date                   location location_type    data_field
##         <date>                      <chr>         <chr>         <chr>
##  1  2016-04-02                      Norte        region zika_reported
##  2  2016-04-02            Brazil-Rondonia         state zika_reported
##  3  2016-04-02                Brazil-Acre         state zika_reported
##  4  2016-04-02            Brazil-Amazonas         state zika_reported
##  5  2016-04-02             Brazil-Roraima         state zika_reported
##  6  2016-04-02                Brazil-Para         state zika_reported
##  7  2016-04-02               Brazil-Amapa         state zika_reported
##  8  2016-04-02           Brazil-Tocantins         state zika_reported
##  9  2016-04-02                   Nordeste        region zika_reported
## 10  2016-04-02            Brazil-Maranhao         state zika_reported
## 11  2016-04-02               Brazil-Piaui         state zika_reported
## 12  2016-04-02               Brazil-Ceara         state zika_reported
## 13  2016-04-02 Brazil-Rio_Grande_do_Norte         state zika_reported
## 14  2016-04-02             Brazil-Paraiba         state zika_reported
## 15  2016-04-02          Brazil-Pernambuco         state zika_reported
## 16  2016-04-02             Brazil-Alagoas         state zika_reported
## 17  2016-04-02             Brazil-Sergipe         state zika_reported
## 18  2016-04-02               Brazil-Bahia         state zika_reported
## 19  2016-04-02                    Sudeste        region zika_reported
## 20  2016-04-02        Brazil-Minas_Gerais         state zika_reported
## # ... with 3 more variables: data_field_code <chr>, value <int>,
## #   unit <chr>
# Region-level rows only: each report_date contributes one row per region
# (five regions in the printed output)
filter(brazil, location_type == "region")
##    report_date     location location_type    data_field data_field_code
## 1   2016-04-02        Norte        region zika_reported          BR0011
## 2   2016-04-02     Nordeste        region zika_reported          BR0011
## 3   2016-04-02      Sudeste        region zika_reported          BR0011
## 4   2016-04-02          Sul        region zika_reported          BR0011
## 5   2016-04-02 Centro-Oeste        region zika_reported          BR0011
## 6   2016-04-23        Norte        region zika_reported          BR0011
## 7   2016-04-23     Nordeste        region zika_reported          BR0011
## 8   2016-04-23      Sudeste        region zika_reported          BR0011
## 9   2016-04-23          Sul        region zika_reported          BR0011
## 10  2016-04-23 Centro-Oeste        region zika_reported          BR0011
## 11  2016-04-30        Norte        region zika_reported          BR0011
## 12  2016-04-30     Nordeste        region zika_reported          BR0011
## 13  2016-04-30      Sudeste        region zika_reported          BR0011
## 14  2016-04-30          Sul        region zika_reported          BR0011
## 15  2016-04-30 Centro-Oeste        region zika_reported          BR0011
## 16  2016-05-07        Norte        region zika_reported          BR0011
## 17  2016-05-07     Nordeste        region zika_reported          BR0011
## 18  2016-05-07      Sudeste        region zika_reported          BR0011
## 19  2016-05-07          Sul        region zika_reported          BR0011
## 20  2016-05-07 Centro-Oeste        region zika_reported          BR0011
## 21  2016-05-14        Norte        region zika_reported          BR0011
## 22  2016-05-14     Nordeste        region zika_reported          BR0011
## 23  2016-05-14      Sudeste        region zika_reported          BR0011
## 24  2016-05-14          Sul        region zika_reported          BR0011
## 25  2016-05-14 Centro-Oeste        region zika_reported          BR0011
## 26  2016-05-21        Norte        region zika_reported          BR0011
## 27  2016-05-21     Nordeste        region zika_reported          BR0011
## 28  2016-05-21      Sudeste        region zika_reported          BR0011
## 29  2016-05-21          Sul        region zika_reported          BR0011
## 30  2016-05-21 Centro-Oeste        region zika_reported          BR0011
## 31  2016-05-28        Norte        region zika_reported          BR0011
## 32  2016-05-28     Nordeste        region zika_reported          BR0011
## 33  2016-05-28      Sudeste        region zika_reported          BR0011
## 34  2016-05-28          Sul        region zika_reported          BR0011
## 35  2016-05-28 Centro-Oeste        region zika_reported          BR0011
## 36  2016-06-11        Norte        region zika_reported          BR0011
## 37  2016-06-11     Nordeste        region zika_reported          BR0011
## 38  2016-06-11      Sudeste        region zika_reported          BR0011
## 39  2016-06-11          Sul        region zika_reported          BR0011
## 40  2016-06-11 Centro-Oeste        region zika_reported          BR0011
##    value  unit
## 1   6295 cases
## 2  30286 cases
## 3  35505 cases
## 4   1797 cases
## 5  17504 cases
## 6   8545 cases
## 7  43000 cases
## 8  46318 cases
## 9   2197 cases
## 10 20101 cases
## 11  8379 cases
## 12 47709 cases
## 13 48027 cases
## 14  2343 cases
## 15 21364 cases
## 16  8053 cases
## 17 51065 cases
## 18 54803 cases
## 19  2431 cases
## 20 21756 cases
## 21  8053 cases
## 22 51065 cases
## 23 54803 cases
## 24  2431 cases
## 25 21756 cases
## 26  8432 cases
## 27 54165 cases
## 28 61309 cases
## 29  2491 cases
## 30 22508 cases
## 31  9022 cases
## 32 59745 cases
## 33 65328 cases
## 34  2463 cases
## 35 24683 cases
## 36 10645 cases
## 37 61829 cases
## 38 65820 cases
## 39  2392 cases
## 40 25246 cases
# Line chart of the reported value over time, one coloured line (with points)
# per region
ggplot(
  data = filter(brazil, location_type == "region"),
  mapping = aes(
    x = report_date,
    y = value,
    group = location,
    color = location
  )
) +
  geom_line() +
  geom_point() +
  labs(title = "Casos de Zika por Regiao do Brasil")

Etapa 4 - Separando as regioes e visualizando os dados

# Keep only the region-level rows for the charts that follow
region <- filter(brazil, location_type == "region")

# Bar chart of the reported value per region.
# NOTE(review): `region` holds one row per region for every report date, so
# each bar stacks the values of all reports - confirm that this is intended.
ggplot(region, aes(x = location, y = value)) +
  geom_bar(stat = "identity") +
  labs(
    x = "Region",
    y = "Numero de Casos Reportados",
    title = "Casos de Zika Reportados no Brasil"
  )

# Same bar chart, restricted to the first k rows of `region` (k = number of
# distinct locations) and with the bars ordered from the largest to the
# smallest value. Turning `location` into an ordered factor whose levels
# follow the sorted rows is what fixes the bar order on the x axis.
region %>%
  slice(seq_along(unique(region$location))) %>%
  arrange(desc(value)) %>%
  mutate(location = ordered(location, levels = location)) %>%
  ggplot(aes(x = location, y = value)) +
  geom_bar(stat = "identity") +
  labs(
    x = "Region",
    y = "Numero de Casos Reportados",
    title = "Casos de Zika Reportados no Brasil"
  )

# One row per location: take the first k rows of `region`, where k is the
# number of distinct locations (this relies on the row order of `region`)
slice(region, seq_along(unique(location)))
## # A tibble: 5 x 7
##   report_date     location location_type    data_field data_field_code
##        <date>        <chr>         <chr>         <chr>           <chr>
## 1  2016-04-02        Norte        region zika_reported          BR0011
## 2  2016-04-02     Nordeste        region zika_reported          BR0011
## 3  2016-04-02      Sudeste        region zika_reported          BR0011
## 4  2016-04-02          Sul        region zika_reported          BR0011
## 5  2016-04-02 Centro-Oeste        region zika_reported          BR0011
## # ... with 2 more variables: value <int>, unit <chr>
# Same first-k rows, sorted from the largest to the smallest reported value
arrange(
  slice(region, seq_along(unique(location))),
  desc(value)
)
## # A tibble: 5 x 7
##   report_date     location location_type    data_field data_field_code
##        <date>        <chr>         <chr>         <chr>           <chr>
## 1  2016-04-02      Sudeste        region zika_reported          BR0011
## 2  2016-04-02     Nordeste        region zika_reported          BR0011
## 3  2016-04-02 Centro-Oeste        region zika_reported          BR0011
## 4  2016-04-02        Norte        region zika_reported          BR0011
## 5  2016-04-02          Sul        region zika_reported          BR0011
## # ... with 2 more variables: value <int>, unit <chr>
# Turn `location` into an ordered factor whose level order follows the rows
# sorted by descending value, then preview the result
region %>%
  slice(seq_along(unique(region$location))) %>%
  arrange(desc(value)) %>%
  mutate(location = ordered(location, levels = location)) %>%
  glimpse()
## Observations: 5
## Variables: 7
## $ report_date     <date> 2016-04-02, 2016-04-02, 2016-04-02, 2016-04-0...
## $ location        <ord> Sudeste, Nordeste, Centro-Oeste, Norte, Sul
## $ location_type   <chr> "region", "region", "region", "region", "region"
## $ data_field      <chr> "zika_reported", "zika_reported", "zika_report...
## $ data_field_code <chr> "BR0011", "BR0011", "BR0011", "BR0011", "BR0011"
## $ value           <int> 35505, 30286, 17504, 6295, 1797
## $ unit            <chr> "cases", "cases", "cases", "cases", "cases"

Etapa 5 - Agrupando e sumarizando

# Rows whose location is the whole country
brazil_totals <- filter(brazil, location == "Brazil")

# Sum of `value` for every combination of report date and region
region_totals <- brazil %>%
  filter(location_type == "region") %>%
  group_by(report_date, location) %>%
  summarize(tot = sum(value))

# Label every row of `brazil` with a region name: a row takes the location of
# the most recent "region" row at or above it. In the printed rows each region
# row is followed by the rows of its states, so every state gets its region.
#
# This replaces a row-by-row loop that
#   * read `newlab` before assigning it whenever the first row was not a
#     region row (an error, or a stale value left over from an earlier run);
#   * iterated over 1:nrow(brazil), i.e. c(1, 0), on an empty data frame;
#   * stopped with an error on a missing location_type.
# Rows that come before the first region row now get NA, and an empty
# `brazil` yields an empty vector.
regvec <- local({
  is_region <- !is.na(brazil$location_type) &
    brazil$location_type == "region"
  region_names <- brazil$location[is_region]

  # cumsum() numbers the region rows 1, 2, ...; every row after a region row
  # inherits the number of the last region row seen so far
  region_idx <- cumsum(is_region)

  # An index of 0 would silently drop elements, so "no region seen yet" is
  # mapped to NA to keep the result aligned with the rows of `brazil`
  region_idx[region_idx == 0L] <- NA_integer_

  region_names[region_idx]
})

# Attach the region labels as an extra column, then drop the aggregate rows
# (the "Brazil" rows and the per-region rows) so that only the remaining,
# state-level rows are kept
statedf <- cbind(brazil, regvec) %>%
  filter(location != "Brazil", location_type != "region")

Etapa 6 - Gerar o total por regioes a partir dos dados transformados

# Rebuild the per-region totals from the state rows: sum `value` for each
# report date and region label
totals <- statedf %>%
  group_by(report_date, regvec) %>%
  summarize(tot = sum(value))

# Obtendo as coordenadas (longitude e latitude) de cada estado do Brasil
#install.packages('ggmap')
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.4.2
# Geocode each distinct state label once and keep the label next to its
# coordinates in the `loc` column.
#
# A lookup can fail transiently (the run recorded below shows status
# OVER_QUERY_LIMIT for one state), which leaves NA coordinates; every case of
# that state is then ignored when the map is drawn. Failed labels are
# therefore retried once after a short pause, and any label that is still
# missing is reported with a warning instead of passing unnoticed.
longlat <- local({
  state_labels <- unique(statedf$location)
  coords <- geocode(state_labels) %>%
    mutate(loc = state_labels)

  not_found <- is.na(coords$lon) | is.na(coords$lat)
  if (any(not_found)) {
    Sys.sleep(2)
    retried <- geocode(state_labels[not_found])
    coords$lon[not_found] <- retried$lon
    coords$lat[not_found] <- retried$lat
    not_found <- is.na(coords$lon) | is.na(coords$lat)
  }

  if (any(not_found)) {
    warning(
      "No coordinates found for: ",
      paste(coords$loc[not_found], collapse = ", "),
      call. = FALSE
    )
  }

  coords
})
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Rondonia&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Acre&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Amazonas&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Roraima&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Para&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Amapa&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Tocantins&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Maranhao&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Piaui&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Ceara&sensor=false
## .Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Rio_Grande_do_Norte&sensor=false
## .Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Paraiba&sensor=false
## Warning: geocode failed with status OVER_QUERY_LIMIT, location = "Brazil-
## Paraiba"
## .Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Pernambuco&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Alagoas&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Sergipe&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Bahia&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Minas_Gerais&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Espirito_Santo&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Rio_de_Janeiro&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Sao_Paulo&sensor=false
## .Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Parana&sensor=false
## .Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Santa_Catarina&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Rio_Grande_do_Sul&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Mato_Grosso_do_Sul&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Mato_Grosso&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Goias&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Brazil-Distrito_Federal&sensor=false
# For the 2016-06-11 report: total the cases per state, attach the geocoded
# coordinates (matching `location` to `loc`) and add a "lat:lon" text column.
# The result is stored in a new data frame called formapping.
formapping <- statedf %>%
  filter(as.character(report_date) == "2016-06-11") %>%
  group_by(location) %>%
  summarize(cases = sum(value)) %>%
  inner_join(longlat, by = c("location" = "loc")) %>%
  mutate(LatLon = paste(lat, lon, sep = ":"))

# Preview the first rows
formapping %>%
  head()
## # A tibble: 6 x 5
##          location cases       lon         lat                 LatLon
##             <chr> <int>     <dbl>       <dbl>                  <chr>
## 1     Brazil-Acre   846 -70.81200  -9.0237964 -9.0237964:-70.8119953
## 2  Brazil-Alagoas  3847 -36.78195  -9.5713058 -9.5713058:-36.7819505
## 3    Brazil-Amapa   189 -52.00296   0.9019925  0.9019925:-52.0029565
## 4 Brazil-Amazonas  3713 -65.85606  -3.4168427 -3.4168427:-65.8560646
## 5    Brazil-Bahia 46427 -41.70073 -12.5797380 -12.579738:-41.7007272
## 6    Brazil-Ceara  2358 -39.32062  -5.4983977 -5.4983977:-39.3206241
# Expand the table into a new data frame called long_formapping in which each
# state appears once per reported case: a state with k cases ends up with k
# identical rows
num_of_times_to_repeat <- formapping$cases
long_formapping <- formapping[
  rep(seq_along(num_of_times_to_repeat), times = num_of_times_to_repeat),
]

# Preview the first rows of the expanded table
long_formapping %>%
  head()
## # A tibble: 6 x 5
##      location cases     lon       lat                 LatLon
##         <chr> <int>   <dbl>     <dbl>                  <chr>
## 1 Brazil-Acre   846 -70.812 -9.023796 -9.0237964:-70.8119953
## 2 Brazil-Acre   846 -70.812 -9.023796 -9.0237964:-70.8119953
## 3 Brazil-Acre   846 -70.812 -9.023796 -9.0237964:-70.8119953
## 4 Brazil-Acre   846 -70.812 -9.023796 -9.0237964:-70.8119953
## 5 Brazil-Acre   846 -70.812 -9.023796 -9.0237964:-70.8119953
## 6 Brazil-Acre   846 -70.812 -9.023796 -9.0237964:-70.8119953

Etapa 7 - Gerando o Mapa do Brasil com a Ocorrencia do Virus Zika

# Instalando o pacote leaflet
# install.packages("leaflet")
library(leaflet)
## Warning: package 'leaflet' was built under R version 3.4.2
# Draw the map from the expanded data frame: default base tiles plus one
# clustered marker per row (zoom in to split the clusters apart)
addMarkers(
  addTiles(leaflet(long_formapping)),
  clusterOptions = markerClusterOptions()
)
## Assuming 'lon' and 'lat' are longitude and latitude, respectively
## Warning in validateCoords(lng, lat, funcName): Data contains 2889 rows with
## either missing or invalid lat/lon values and will be ignored